{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 去除重复项"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas  as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     sh4721209\n",
       "1     sh4707812\n",
       "2     sh4706115\n",
       "3     sh4696934\n",
       "4     sh4683880\n",
       "5     sh4683135\n",
       "6     sh4672166\n",
       "7     sh4747142\n",
       "8     sh4361293\n",
       "9     sh4753133\n",
       "10    sh4745920\n",
       "11    sh4698624\n",
       "12    sh4659106\n",
       "13    sh4644111\n",
       "14    sh4593293\n",
       "15    sh4330138\n",
       "16    sh4790380\n",
       "17    sh4678694\n",
       "18    sh4648294\n",
       "19    sh4594857\n",
       "20    sh4787136\n",
       "21    sh4683880\n",
       "22    sh4683135\n",
       "23    sh4672166\n",
       "24    sh4745920\n",
       "25    sh4698624\n",
       "26    sh4747142\n",
       "Name: key, dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_path=('../../lianjia/results/test.csv')\n",
    "index_df=pd.read_table(file_path,sep=',',encoding='gbk')\n",
    "index_df['key']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 判断重复项\n",
    "`DataFrame.duplicated(subset=None, keep='first')`\n",
    "\n",
    "keep : {‘first’, ‘last’, False}, default ‘first’\n",
    "\n",
    "- `first` : Mark duplicates as True except for the first occurrence.\n",
    "- `last` : Mark duplicates as True except for the last occurrence.\n",
    "- `False` : Mark all duplicates as True."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     False\n",
       "1     False\n",
       "2     False\n",
       "3     False\n",
       "4     False\n",
       "5     False\n",
       "6     False\n",
       "7     False\n",
       "8     False\n",
       "9     False\n",
       "10    False\n",
       "11    False\n",
       "12    False\n",
       "13    False\n",
       "14    False\n",
       "15    False\n",
       "16    False\n",
       "17    False\n",
       "18    False\n",
       "19    False\n",
       "20    False\n",
       "21     True\n",
       "22     True\n",
       "23     True\n",
       "24     True\n",
       "25     True\n",
       "26     True\n",
       "Name: key, dtype: bool"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index_df['key'].duplicated()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 去除重复项\n",
    "`DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)[source]`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "去除 `key` 列重复的项目"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     sh4721209\n",
       "1     sh4707812\n",
       "2     sh4706115\n",
       "3     sh4696934\n",
       "4     sh4683880\n",
       "5     sh4683135\n",
       "6     sh4672166\n",
       "7     sh4747142\n",
       "8     sh4361293\n",
       "9     sh4753133\n",
       "10    sh4745920\n",
       "11    sh4698624\n",
       "12    sh4659106\n",
       "13    sh4644111\n",
       "14    sh4593293\n",
       "15    sh4330138\n",
       "16    sh4790380\n",
       "17    sh4678694\n",
       "18    sh4648294\n",
       "19    sh4594857\n",
       "20    sh4787136\n",
       "Name: key, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new=index_df.drop_duplicates(['key'])\n",
    "new['key']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 参考\n",
    "1. [python pandas dataframe 去重函数](http://blog.csdn.net/xinxing__8185/article/details/48022401)\n",
    "2. [pandas.DataFrame.duplicated](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.duplicated.html)\n",
    "3. [pandas.DataFrame.drop_duplicates](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop_duplicates.html)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
